{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Import Packages\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import elasticsearch\n",
    "import re\n",
    "import json\n",
    "from datetime import datetime\n",
    "from pprint import pprint\n",
    "import timeit\n",
    "\n",
    "# Define elasticsearch class\n",
    "es = elasticsearch.Elasticsearch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### Helper Functions\n",
    "# convert np.int64 into int. json.dumps does not work with int64\n",
    "class SetEncoder(json.JSONEncoder):\n",
    "    def default(self, obj):\n",
    "        if isinstance(obj, np.int64):\n",
    "            return np.int(obj)\n",
    "        # else\n",
    "        return json.JSONEncoder.default(self, obj)\n",
    "\n",
    "# Convert datestamp into ISO format    \n",
    "def str_to_iso(text):\n",
    "    if text != '':\n",
    "        for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):\n",
    "            try:\n",
    "                return datetime.isoformat(datetime.strptime(text, fmt))\n",
    "            except ValueError:\n",
    "                pass\n",
    "        raise ValueError('no valid date format found')\n",
    "    else:\n",
    "        return None\n",
    "    \n",
    "# Custom groupby function    \n",
    "def concatdf(x):\n",
    "    if len(x) > 1:  #if multiple values\n",
    "        return list(x)\n",
    "    else: #if single value\n",
    "        return x.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Import Data \n",
    "# Load projects, resources & donations data\n",
    "projects = pd.read_csv('./data/opendata_projects.csv', index_col=None, escapechar='\\\\')\n",
    "donations = pd.read_csv('./data/opendata_donations.csv', index_col=None, escapechar='\\\\')\n",
    "resources = pd.read_csv('./data/opendata_resources.csv', index_col=None, escapechar='\\\\' )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Data Cleanup\n",
    "# replace nan with ''\n",
    "projects = projects.fillna('') \n",
    "donations = donations.fillna('')\n",
    "resources = resources.fillna('')\n",
    "\n",
    "#  Clean up column names: remove _ at the start of column name\n",
    "donations.columns = donations.columns.map(lambda x: re.sub('^ ', '', x))\n",
    "donations.columns = donations.columns.map(lambda x: re.sub('^_', '', x))\n",
    "projects.columns = projects.columns.map(lambda x: re.sub('^_', '', x))\n",
    "resources.columns = resources.columns.map(lambda x: re.sub('^ ', '', x))\n",
    "resources.columns = resources.columns.map(lambda x: re.sub('^_', '', x))\n",
    "\n",
    "# Add quotes around projectid values to match format in projects / donations column\n",
    "resources['projectid'] = resources['projectid'].map(lambda x: '\"' + x +'\"')\n",
    "\n",
    "# Add resource_prefix to column names\n",
    "resources.rename(columns={'vendorid': 'resource_vendorid', 'vendor_name': 'resource_vendor_name', 'item_name': 'resource_item_name',\n",
    "       'item_number' :'resource_item_number', \"item_unit_price\": 'resource_item_unit_price',\n",
    "       'item_quantity': 'resource_item_quantity'}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "### Merge multiple resource row per projectid into a single row\n",
    "# NOTE: section may take a few minutes to execute\n",
    "concat_resource = pd.DataFrame()\n",
    "gb = resources.groupby('projectid')\n",
    "\n",
    "start = timeit.timeit()\n",
    "for a in resources.columns.values:   \n",
    "    print(a)\n",
    "    concat_resource[a] = gb[a].apply(lambda x: concatdf(x))\n",
    "    #print(xx.index)\n",
    "    \n",
    "end = timeit.timeit()\n",
    "print(end - start)\n",
    "\n",
    "concat_resource['projectid'] = concat_resource.index;\n",
    "concat_resource.reset_index(drop=True);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Rename Project columns\n",
    "projects.rename(columns=lambda x: \"project_\" + x, inplace=True)\n",
    "projects.rename(columns={\"project_projectid\": \"projectid\"}, inplace=True)\n",
    "projects.columns.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#### Merge data into single frame\n",
    "data = pd.merge(projects, concat_resource, how='left', right_on='projectid', left_on='projectid') \n",
    "data = pd.merge(donations, data, how='left', right_on='projectid', left_on='projectid')\n",
    "data = data.fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#### Process columns\n",
    "# Modify date formats\n",
    "data['project_date_expiration'] = data['project_date_expiration'].map(lambda x: str_to_iso(x));\n",
    "data['project_date_posted'] = data['project_date_posted'].map(lambda x: str_to_iso(x))\n",
    "data['project_date_thank_you_packet_mailed'] = data['project_date_thank_you_packet_mailed'].map(lambda x: str_to_iso(x))\n",
    "data['project_date_completed'] = data['project_date_completed'].map(lambda x: str_to_iso(x))\n",
    "data['donation_timestamp'] = data['donation_timestamp'].map(lambda x: str_to_iso(x))\n",
    "\n",
    "# Create location field that combines lat/lon information\n",
    "data['project_location'] = data[['project_school_longitude','project_school_latitude']].values.tolist()\n",
    "del(data['project_school_latitude'])  # delete latitude field\n",
    "del(data['project_school_longitude']) # delete longitude"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Create and configure Elasticsearch index\n",
    "\n",
    "# Name of index and document type\n",
    "index_name = 'donorschoose';\n",
    "doc_name = 'donation'\n",
    "\n",
    "# Delete donorschoose index if one does exist\n",
    "if es.indices.exists(index_name):\n",
    "    es.indices.delete(index_name)\n",
    "\n",
    "# Create donorschoose index    \n",
    "es.indices.create(index_name)\n",
    "\n",
    "# Add mapping \n",
    "with open('donorschoose_mapping.json') as json_mapping:\n",
    "    d = json.load(json_mapping)\n",
    "\n",
    "es.indices.put_mapping(index=index_name, doc_type=doc_name, body=d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Index Data into Elasticsearch\n",
    " \n",
    "for don_id, thisDonation in data.iterrows():\n",
    "    # print every 10000 iteration\n",
    "    if don_id % 10000 == 0:\n",
    "        print(don_id)\n",
    "    \n",
    "    thisDoc = json.dumps(thisDonation.to_dict(), cls=SetEncoder);\n",
    "    \n",
    "    # write to elasticsearch\n",
    "    es.index(index=index_name, doc_type=doc_name, id=thisDonation['donationid'], body=thisDoc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ""
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Raw Cell Format",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}